/*
This file generates the variables used in the paper, from the dataset stemming from the HERO survey/sampling project.

This file is run from other do-files, so it neither changes directory nor defines globals; the caller must already have set the global dir_data.
*/

* Step 1: collapse the census file to one row per COUNTY.
*   population_per_county    = sum of AJWNE001 over the rows of the county
*   population_per_household = unweighted mean of AJ19E001 over the same rows
* The helper variable is a plain copy of AJWNE001; collapse keeps only COUNTY
* and the two aggregates, so no separate keep step is needed afterwards.
tempfile census
use "${dir_data}/census_clean", clear
generate blkgroup_population = AJWNE001
collapse (sum)  population_per_county=blkgroup_population ///
         (mean) population_per_household=AJ19E001,        ///
         by(COUNTY)
save "`census'"

* Step 2: load the survey sample and attach the county aggregates.
* Sample rows without a census match are kept; census-only counties are dropped.
use "${dir_data}/final_sample.dta", clear
merge m:1 COUNTY using "`census'", keep(master match) nogenerate


* Household data: keep only the four listed counties.
keep if inlist(COUNTY, "Salt Lake County", "Summit County", "Utah County", "Davis County")

* Tag variables: 1 on the first row of each group after sorting by the group
* key, 0 on every other row. They let group-level values be counted once.
by house_id, sort:       generate unique_house    = (_n == 1)
by COUNTY, sort:         generate unique_county   = (_n == 1)
by geoid_blkgroup, sort: generate unique_blkgroup = (_n == 1)
by SmallAreaN, sort:     generate unique_smol     = (_n == 1)

***Census characteristics
* 1) and 2) County population and household population: both were merged in
*    from the collapsed census file earlier in this script; only labels are set.
    label variable population_per_county    "Population"
    label variable population_per_household "Household Population"

* 3) Median age: county mean of the block-group value median_age. Only the row
*    tagged by unique_blkgroup contributes, so each block group present in the
*    data counts once no matter how many rows it has.
    generate bg_age_once = median_age if unique_blkgroup == 1
    by COUNTY, sort: egen mean_county_median_age = mean(bg_age_once)
    label variable mean_county_median_age "Median Age"
    drop bg_age_once

* 4) Percent Hispanic: block-group Hispanic counts summed within county, divided
*    by the census county population, times 100.
*    blkgroup_population must already be a variable of the sample data; the one
*    generated while building the census aggregates does not survive the collapse.
*    NOTE(review): the numerator only covers block groups that appear in the
*    data in memory, while population_per_county is the total over the census
*    file. Confirm the data contains every block group of each county.
*    NOTE(review): the factor 100 implies perc_hispanic is a fraction - confirm.
    generate bg_hisp_count = perc_hispanic * blkgroup_population if unique_blkgroup == 1
    by COUNTY, sort: egen county_hisp_count = total(bg_hisp_count)
    generate perc_hispanic_by_county = 100 * county_hisp_count / population_per_county
    label variable perc_hispanic_by_county "\% Hispanic"
    drop bg_hisp_count county_hisp_count


******5) Reported Prevalence (5/7/2020)
    /*
    Case and death counts are hard-coded from the merged coronavirus dashboard
    data (columns: date,county,state,fips,cases,deaths):
      2020-05-07,Salt Lake,Utah,49035,3010,39
      2020-05-07,Utah,Utah,49049,1219,11
      2020-05-07,Davis,Utah,49011,315,2
      2020-05-07,Summit,Utah,49043,382,0
    Any county not listed below gets a missing value.
    */

    * Reported cases, then cases per 100,000 of county population.
    generate county_rate_5_7 = cond(COUNTY == "Salt Lake County", 3010, ///
                               cond(COUNTY == "Utah County",      1219, ///
                               cond(COUNTY == "Davis County",      315, ///
                               cond(COUNTY == "Summit County",     382, .))))
    generate mean_county_rate_5_7 = county_rate_5_7 / (population_per_county / 100000)
    label variable mean_county_rate_5_7 "Reported Prevalence (5/7/2020)"

    * Reported deaths (raw count, not scaled by population).
    generate county_deaths_5_7 = cond(COUNTY == "Salt Lake County", 39, ///
                                 cond(COUNTY == "Utah County",      11, ///
                                 cond(COUNTY == "Davis County",      2, ///
                                 cond(COUNTY == "Summit County",     0, .))))
    label variable county_deaths_5_7 "Reported Deaths (5/7/2020)"


***7-9) Household and individual counts
* Individual-level 0/1 indicators:
*   sampled_individual   : phase1_design is 1, 2, 5 or 6, or phase2 equals 1
*   in_sample_individual : sampled, and either race___1 is recorded or the
*                          collection date is before the 01jul2020 23:55 cutoff
*   is_tested_individual : sampled, and collected before the cutoff
*   cia_individual       : tested, and cia_pos is non-missing
*   pcr_individual       : tested, and pcr_pos is non-missing
* A missing collection date is never below the cutoff (missing sorts above every
* number), so no separate missing check is needed in the date comparison.
    generate sampled_individual = (phase2 == 1) | inlist(phase1_design, 1, 2, 5, 6)
    generate in_sample_individual = (sampled_individual == 1) & ///
        (!missing(race___1) | (collection_date_arup < tc(01july2020 23:55)))
    generate is_tested_individual = (sampled_individual == 1) & (collection_date_arup < tc(01july2020 23:55))
    generate cia_individual = (is_tested_individual == 1) & !missing(cia_pos)
    generate pcr_individual = (is_tested_individual == 1) & !missing(pcr_pos)

* House-level versions: 1 if any member of the house has the indicator, kept
* only on the row tagged by unique_house so each house is counted once below.
    foreach var in sampled in_sample cia pcr {
        by house_id, sort: egen `var'_house = max(`var'_individual)
        replace `var'_house = . if unique_house != 1
    }

* Individual has at least one usable test result (antibody or viral).
    capture drop any_test
    generate any_test = inlist(1, cia_individual, pcr_individual)

* County totals of houses and individuals, stored on every row of the county.
    by COUNTY, sort: egen households_sampled = total(sampled_house)
    label variable households_sampled "Households Sampled"
    by COUNTY, sort: egen in_sample_households = total(in_sample_house)
    label variable in_sample_households "Households In Sample"
    by COUNTY, sort: egen cia_households = total(cia_house)
    label variable cia_households "Households with Antibody Test"
    by COUNTY, sort: egen pcr_households = total(pcr_house)
    label variable pcr_households "Households with Viral Test"
    by COUNTY, sort: egen in_sample_individuals = total(in_sample_individual)
    label variable in_sample_individuals "Individuals In Sample"
    by COUNTY, sort: egen cia_individuals = total(cia_individual)
    label variable cia_individuals "Individuals with Antibody Test"
    by COUNTY, sort: egen pcr_individuals = total(pcr_individual)
    label variable pcr_individuals "Individuals with Viral Test"


  * Sample characteristics by county. Each statistic is computed over rows with
  * in_sample_individual equal to 1 and is left missing on all other rows.

  * 10) Median age
    by COUNTY, sort: egen county_median_age = median(age) if in_sample_individual == 1
    label variable county_median_age "Median Age"

  * 11) Percent Hispanic: 100 times the county mean of ethnicity.
  *     NOTE(review): assumes ethnicity is coded 0/1 with 1 = Hispanic - confirm.
    by COUNTY, sort: egen sample_perc_hispanic = mean(ethnicity) if in_sample_individual == 1
    replace sample_perc_hispanic = 100 * sample_perc_hispanic
    label variable sample_perc_hispanic "\% Hispanic"

  * 12) Percent female. The earlier heading said male, but the code flags
  *     sex equal to 2; per the original note sex is 1 male, 2 female, 3 other.
  *     Rows with missing sex are excluded from the mean.
    generate is_female = (sex == 2) if sex < .
    by COUNTY, sort: egen sample_perc_female = mean(is_female) if in_sample_individual == 1
    replace sample_perc_female = 100 * sample_perc_female
    label variable sample_perc_female "\% Female"

  * 14) Percent very concerned: covid_concern equal to 3, among in-sample rows
  *     with a non-missing answer. The variable name keeps its original spelling
  *     because other files may refer to it.
    generate is_concerened = (covid_concern == 3) if (covid_concern < .) & (in_sample_individual == 1)
    by COUNTY, sort: egen sample_perc_concern = mean(is_concerened) if in_sample_individual == 1
    replace sample_perc_concern = 100 * sample_perc_concern
    label variable sample_perc_concern "\% Very Concerned"

  * 15) Viral prevalence: 100 times the county mean of pcr_pos.
  *     NOTE(review): the mean uses every in-sample row with a non-missing
  *     pcr_pos, which can include rows collected after the cutoff that
  *     pcr_individual excludes - confirm this is intended.
    by COUNTY, sort: egen pcr_county_mean = mean(pcr_pos) if in_sample_individual == 1
    replace pcr_county_mean = 100 * pcr_county_mean
    label variable pcr_county_mean "Viral Prevalence"

  * 16) Antibody prevalence: 100 times the county mean of cia_pos (same caveat).
    by COUNTY, sort: egen cia_county_mean = mean(cia_pos) if in_sample_individual == 1
    replace cia_county_mean = 100 * cia_county_mean
    label variable cia_county_mean "Antibody Prevalence"

** Additional variables for weighting functions
*   one     : constant 1 on every row
*   opp     : 1 for in-sample rows whose cia_pos equals 0, otherwise 0
*   any_pos : defined only where any_test equals 1; 1 if pcr_pos or cia_pos
*             equals 1, otherwise 0; missing on rows without any test
  generate one = 1
  generate opp = (in_sample_individual == 1) & (cia_pos == 0)
  generate any_pos = ((pcr_pos == 1) | (cia_pos == 1)) if any_test == 1

  *generate smooth probabilities within stratum
  * For every level of stratum_phase1, count the distinct geoid_tractgroup values
  * among rows with phase1_design equal to 1 or 5, and store that count on all
  * rows of the stratum. The comparison against a quoted value means
  * stratum_phase1 is a string variable.
  * The distinct command is community-contributed (not part of official Stata)
  * and must be installed before this section runs.
  levelsof stratum_phase1, local(levels)
  gen num_groups_selected = .
  foreach var of local levels {
    distinct geoid_tractgroup if (phase1_design == 1 | phase1_design == 5) & stratum_phase1 == "`var'"
    replace num_groups_selected = r(ndistinct) if stratum_phase1 == "`var'"
  }
  * Address selection probability given the selected groups:
  * (selected groups x 210) / viable addresses in the stratum.
  * NOTE(review): 210 is presumably the number of addresses drawn per selected
  * group - confirm against the sampling design.
  gen smooth_addr_cond_tgrp = num_groups_selected * 210/ num_viable_addr_in_stratum

  *Park city had 63 blocks chosen
  * The numerator 441 equals 63 x 7.
    replace smooth_addr_cond_tgrp = 441/ num_viable_addr_in_stratum if location_name_phase1 == "park city"
  *Davis county had 45 blocks chosen
  * The numerator 315 equals 45 x 7. The override is keyed on the location name
  * farmington, not on COUNTY.
    replace smooth_addr_cond_tgrp = 315/ num_viable_addr_in_stratum if location_name_phase1 == "farmington"

  *generate smooth probabilities
  * Phase 1 address probability = group probability x conditional address probability.
  gen smooth_prob_addr = prob_tgrp_phase1*smooth_addr_cond_tgrp

  *generate smooth probabilities for phase2
  * phase2_house_counter carries phase2 on the first row of each house and 0 on
  * the rest, so the stratum total below counts each house once.
  * NOTE(review): which row sorts first within a house is not fixed, so this
  * relies on phase2 being constant within house_id - confirm.
  bysort house_id: gen phase2_house_counter = cond(_n==1, phase2, 0)
  bysort stratum_phase2: egen num_addr_stratum_selected2 = total(phase2_house_counter)

  * Phase 2 probability = houses counted in the stratum / addresses in the stratum.
  * It is missing wherever num_addr_in_stratum_phase2 is missing, and that
  * missing value carries through to the combined probability and the weight.
  gen smooth_prob_addr2 = num_addr_stratum_selected2/num_addr_in_stratum_phase2

  *generate combined probability
  * p1 + (1 - p1) * p2: selected in phase 1, or not selected in phase 1 and then
  * selected in phase 2. The weight is the inverse of this probability.
  * NOTE(review): this form presumes phase 2 selection is independent of the
  * phase 1 outcome - confirm.
  gen smooth_prob_total = smooth_prob_addr + (1 - smooth_prob_addr) * smooth_prob_addr2
  gen w_together_smooth =1/smooth_prob_total
